\documentclass[final, 5p, sort&compress, times]{elsarticle}
\usepackage[english]{babel}
\usepackage[latin1]{inputenc} 
\usepackage[T1]{fontenc} 
\usepackage{verbatim}
\usepackage{amsmath, amsthm, amssymb, amsfonts}
\usepackage{graphicx, epsfig}
\usepackage{mbboard, calligra}
%\usepackage[matha,mathb]{mathabx}
\usepackage{longtable}
\usepackage{lmodern}
\usepackage{hyperref}
\usepackage[babel=true]{csquotes} 

\newcommand{\logr}{\sffamily {R}\rmfamily}
\newcommand{\nor}{\mathcal{N}}
\newcommand{\one}[1]{\textbf{1}_{#1}} % column vector of ones
\newcommand{\tr}[1]{#1^\top} % transpose of a matrix or vector
\newcommand{\mat}[1]{\textbf{#1}} % matrices, in bold
\newcommand{\diag}[1]{\text{diag}\left(#1\right)}
\newcommand{\rr}{\mathbb{R}}
\newcommand{\nn}{\mathbb{N}}
\newcommand{\pps}{\mathbb{P}}
\newcommand{\pp}[1]{\mathbb{P}\left\{ #1 \right\}}
\newcommand{\esp}[1]{\mathbb{E}\left[ #1 \right]}
\newcommand{\std}[1]{\widehat{\text{se}}\left(#1\right)}
\newcommand{\stdb}[1]{\widehat{\text{se}}_{\text{boot}}\left(#1\right)}
\newcommand{\moyn}{\overline{X}_n}
\newcommand{\var}[1]{\mathbb{V}\left(#1\right)}
\renewcommand{\hat}{\widehat}

\renewcommand{\epsilon}{\varepsilon}

\journal{Forensic Science International}

\begin{document}

\begin{frontmatter}
\title{Comparison of the performance of classification trees, linear discriminant analysis, logistic regression and support vector machines, applied to sex determination from the skull in four modern populations}

\author[pacea]{Fr\'ed\'eric Santos\corref{cor1}}
\ead{f.santos@pacea.u-bordeaux1.fr}

\author[pacea]{Pierre Guyomarc'h}
\ead{pierreguyo@gmail.com}

\author[pacea]{Jaroslav Bruzek}
\ead{j.bruzek@pacea.u-bordeaux1.fr}

\cortext[cor1]{Corresponding author}
\address[pacea]{Univ. Bordeaux -- CNRS -- MCC, PACEA, UMR 5199, F-33400 Talence, France.} 

\begin{abstract} 
[To be written jointly.]
\end{abstract}

\begin{keyword}
Later
\end{keyword}

\end{frontmatter}


%%%%%%%% 
% TEXT
%%%%%%%%

\section{Introduction}
[Section to be written collaboratively: Jaro and Pierre for the anthropological/biological background, myself for a state of the art of the statistical methods and a quick overview of our overall approach.]

\section{Material and methods}
\subsection{Data}
The study sample consists of four different sub-samples of individuals of known age and sex, extracted from osteological reference collections around the world:
\begin{itemize}
\item The Coimbra Identified Skeletons Collection (University of Coimbra, Portugal), composed of Portuguese individuals deceased during the first half of the 20th century.
\item The Olivier collection (Mus\'ee de l'Homme, Mus\'eum National d'Histoire Naturelle, Paris, France), composed of French individuals deceased during the mid-20th century.
\item The Maxwell Museum's Documented Skeletal Collection (Albuquerque, New Mexico, USA), composed of American residents deceased at the end of the 20th century.
\item The Thai skeletal collection hosted by the Department of Anatomy of the University of Chiang Mai (Thailand), composed of Thai individuals deceased at the end of the 20th century.
\end{itemize}
The sex and age distribution of each sample is presented in Table \ref{tab:datadesc}.

\begin{table}[h]
\sf
\begin{tabular}{|c|c|c|c|c|c|}
\hline
Popu- & \multicolumn{2}{c|}{Male} & \multicolumn{2}{c|}{Female} & Total \\
\cline{2-5} 
lation & $n$ & age: $m$ (sd) & $n$ & age: $m$ (sd) & ($n$) \\
\hline
POR & 53 & 51 (18) & 54 & 57 (20) & 107 \\
\hline
FRA & 25 & 50 (10) & 25 & 56 (13) & 50 \\
\hline
USA & 33 & 54 (13) & 33 & 69 (17) & 66 \\
\hline
THA & 47 & 73 (14) & 45 & 64 (15) & 92 \\
\hline
Total ($n$) & \multicolumn{2}{c|}{158} & \multicolumn{2}{c|}{157} & 315 \\
\hline
\end{tabular}
\caption{Sex distribution and mean age of the sample by population (POR = Portugal; FRA = France; USA = United States; THA = Thailand; $m$ = mean; sd = standard deviation; ages in years).}
\label{tab:datadesc}
\end{table}

Craniometric variables of the Portuguese, American and Thai samples were measured in millimetres using callipers. The French sample was measured with a digitizer, and linear measurements were extracted from the 3D coordinates of the corresponding landmarks (see \cite{Franklin05, Guyo10} for description and validation). Sixteen traditional measurements are used in this study; their codes and definitions are presented in Table \ref{tab:mesdesc}.

\begin{table}[h]
\sf
\begin{tabular}{|c|c|c|}
\hline
\multicolumn{2}{|c|}{Code} &  \\
\cline{1-2}
Martin \cite{Brauer88} & Howells \cite{Howells73} & Definition \\
\hline
M1 & GOL & Glabello-Occipital Length \\
\hline
M5 & BNL & Basion-Nasion Length \\
\hline
M8 & XCB & Maximum Cranial Breadth \\
\hline
M9 & WFB & Minimum Frontal Breadth \\
\hline
M12 & ASB & Biasterionic Breadth \\
\hline
M17 & BBH & Basion-Bregma Height \\
\hline
M11 & AUB & Biauricular Breadth \\
\hline
M29 & FRC & Nasion-Bregma Chord \\
\hline
M30 & PAC & Bregma-Lambda Chord \\
\hline
M31 & OCC & Lambda-Opisthion Chord \\
\hline
M40 & BPL & Basion-Prosthion Length \\
\hline
M45 & ZYB & Bizygomatic Breadth \\
\hline
M48 & NPH & Nasion-Prosthion Height \\
\hline
M52 & OBH & Orbit Height (left) \\
\hline
M54 & NLB & Nasal Breadth \\
\hline
M61 & MAB & Palate Breadth \\
\hline
\end{tabular}
\caption{Craniometric measurement codes and definitions.}
\label{tab:mesdesc}
\end{table}

\subsection{Classification methods}
\subsubsection{Linear Discriminant Analysis}
Linear Discriminant Analysis (LDA) is one of the oldest and best-known supervised classification methods, introduced by R. Fisher \cite{Fisher36}.

\textit{Geometric rules}. --- Let $X$ be the $(n \times p)$ matrix of the $n$ individuals described by $p$ (previously centered) variables $X_1, \dots, X_p$. Fisher's LDA searches for the linear combination $C = Xu$, where $u \in \rr^p$, which has minimal within-groups variance and maximal between-groups variance.

$C$ is centered, so its variance is given by:
\[ \sigma^2(C) = \frac{1}{n} C^\top C = u^\top \left[\frac{1}{n} X^\top X \right]u = u^\top V u,  \]

where $V = \frac{1}{n} X^\top X$ is the total variance-covariance matrix.

According to the Huygens theorem for covariance, $V$ can be written as $V = W + B$, where $W$ and $B$ are respectively the within-groups and between-groups variance-covariance matrices. Thus, $\sigma^2(C) = u^\top W u + u^\top B u$.

The vector $u$ is chosen to maximize the contribution of the between-groups variance to the total variance, \textit{i.e.} it maximizes the ratio $\dfrac{u^\top B u}{u^\top V u}$ or, equivalently, it maximizes ${u^\top B u}$ under the constraint ${u^\top V u} = 1$.

This is a classical constrained optimization problem, whose solution $u$ is the eigenvector associated with the largest eigenvalue of $V^{-1}B$.

A score $S(i)$ is then defined for each individual $x_i \in \rr^p$ as its projection on $u$: $S(i) = x_i \cdot u = \sum_{j=1}^p x_{ij} u(j)$. The subject is then assigned to the group whose mean score is closest to its own.

\textit{Bayesian rules}. --- A Bayesian probabilistic approach to LDA provides posterior class probabilities. In a binary classification problem, let $p_1, p_2$ be the prior probabilities of each group in the population. We suppose that the probability distribution of each group $i = 1,2$ is given by the density function $x \mapsto P(x|i)$ --- generally a Gaussian density fitted by maximum likelihood. The posterior probability that an undetermined individual $x$ belongs to group $i$ is then given by Bayes' formula:
\[\pp{i | x} = \frac{P(x|i)\, \cdot\, p_i}{\sum_{k=1,2} P(x|k)\, \cdot \, p_k} \]

For a complete presentation of LDA, see \cite{Morrison03}.
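
As an illustration, a minimal sketch of this Bayesian approach in R could rely on the \texttt{lda} function of the MASS package; the data frames \texttt{train} and \texttt{test} and their factor column \texttt{sex} are hypothetical names used for exposition only:
\begin{verbatim}
library(MASS)
## Minimal sketch: LDA with equal priors and
## posterior class probabilities (hypothetical
## data frames `train` and `test`, each with a
## factor column `sex`).
fit <- lda(sex ~ ., data = train,
           prior = c(0.5, 0.5))
pred <- predict(fit, newdata = test)
pred$class      # predicted sex
pred$posterior  # posterior probabilities
\end{verbatim}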

\subsubsection{Classification Trees}
Classification Trees (CT) are a class of recursive partitioning methods: they predict the value of a target variable using a succession of segmentations of the data set based on the input variables. 

Numerous algorithms exist to build a decision tree (such as CART, CHAID, ID3 or QUEST), but only CART (Classification And Regression Trees, see \cite{Breinman84}) is used in this paper.

The algorithm starts from the root node of the tree, composed of the complete sample, and then proceeds recursively: at a given node, two child nodes are produced by splitting the individuals of this node according to the threshold value of the predictor that optimizes a discrimination criterion. At the end, each terminal leaf represents a classification value, given the values the subject takes for the predictors encountered from the root to the leaf.

At each node $t$, the best predictor and its threshold value are chosen so that the resulting split maximizes the decrease in impurity:
\[g(t) - p_L g(t_L) - p_R g(t_R) \]
where:
\begin{itemize}
\item $p_L$ is the proportion of individuals of node $t$ sent to the left child node $t_L$;
\item $p_R$ is the proportion of individuals of node $t$ sent to the right child node $t_R$;
\item $g(t)$ is the Gini impurity index at node $t$, defined as $g(t) = 1 - \sum_{j \in \left\{F,M \right\}} p^2(j|t)$, where $p(j|t)$ is the (empirical) conditional probability of being of sex $j$ given node $t$.
\end{itemize}

The tree is grown until all the predictors are used, or until no split can further reduce the impurity of the leaves.

The optimal size of the tree (\textit{i.e.}, the optimal number of terminal leaves to retain) is classically determined by cross-validation.

Supplemental details about CT can be found in \cite{Hastie09}.
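
As an illustration, a minimal CART sketch in R could use the rpart package; the tree is then pruned at the complexity value minimizing the cross-validated error (the data frame \texttt{train} is again a hypothetical name):
\begin{verbatim}
library(rpart)
## CART tree with Gini splitting, then pruning
## at the complexity value minimizing the
## cross-validated error.
tree <- rpart(sex ~ ., data = train,
              method = "class",
              parms = list(split = "gini"))
cp_best <- tree$cptable[
  which.min(tree$cptable[, "xerror"]), "CP"]
pruned <- prune(tree, cp = cp_best)
\end{verbatim}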

\subsubsection{Logistic regression}
Logistic regression (LR) models the relationship between a set of continuous or categorical predictors $X = (X_1, \cdots, X_p)$ and a dichotomous outcome variable $Y$.

Here, as $Y$ represents the sex of the individuals, its set of possible values is $\left\{ F, M \right\}$. It therefore follows a Bernoulli distribution, with $p = \pp{Y = F}$ and $1-p = \pp{Y = M}$. LR builds a model to estimate the outcome $Y$ as a function of a given configuration of the predictors $X$, by evaluating the following posterior probability of being a female individual:
\[ p(x_1, \cdots, x_p) := \pp{Y=F \, | \, X_1=x_1, \cdots, X_p=x_p}\]

These probabilities are modeled as follows, where the coefficients $\beta_0, \beta_1, \cdots, \beta_p$ are fitted by maximum likelihood:
\[ p(x_1, \cdots, x_p) = \frac{\exp(\beta_0 + \beta_1 x_1 + \cdots + \beta_p x_p)}{1 + \exp(\beta_0 + \beta_1 x_1 + \cdots + \beta_p x_p)}\]

Quite naturally, an individual defined by a profile $(X_1=x_1, \cdots, X_p=x_p)$ is classified as female if the above probability is greater than 0.5 (or any other classical threshold value, such as 0.9 or 0.95).

For a complete presentation of LR, see \cite{Kleinbaum10}.
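
As an illustration, a minimal sketch of this model in R relies on the \texttt{glm} function (same hypothetical data frames as above); note that \texttt{glm} models the probability of the second level of the response factor, hence the releveling:
\begin{verbatim}
## Logistic regression fitted by maximum
## likelihood; "M" is set as the reference
## level so that P(Y = F) is modelled.
train$sex <- relevel(train$sex, ref = "M")
fit <- glm(sex ~ ., data = train,
           family = binomial)
p_F <- predict(fit, newdata = test,
               type = "response")
pred <- ifelse(p_F > 0.5, "F", "M")
\end{verbatim}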

\subsubsection{Support Vector Machines}
Support Vector Machines (SVM) are non-probabilistic (and, in their standard form, binary) classifiers, which aim to find a linear separating hyperplane maximizing the distance to the nearest individuals of each of the two classes; this distance is called the \textit{margin}.

This margin-maximization principle is justified by the Vapnik-Chervonenkis theory \cite{Vapnik10}, which shows that maximizing the margin minimizes a bound on the classification error.

In most cases, the two groups are not linearly separable in the original space, and the data are re-described, using a non-linear function $\phi$, in a higher-dimensional (possibly infinite-dimensional) space in which a linear separation becomes possible.

The best separating hyperplane in the redescription space then has the general form $w^\top \phi(x) + w_0 = 0$, where $x = (x_1, \cdots, x_p)$ represents the coordinates of an individual, $w$ is a vector of weights in the redescription space, and $w_0 \in \rr$ is the intercept.

This hyperplane defines a discrimination function $h(x) = w^\top \phi(x) + w_0$, such that an individual is classified as one sex (say, female) if $h(x) > 0$, and as the other sex if $h(x) < 0$.

In practice, the function $\phi$ is not defined directly: as the computation of the scalar products $\tr{\phi(x_i)} \cdot \phi(x_j)$, which are necessary to find the best hyperplane, would be time-consuming in the high-dimensional space, they are obtained instead by a single evaluation of a predefined kernel function $K(x_i, x_j)$. Any symmetric and positive semi-definite function $K$ is suitable, but in most cases a Gaussian kernel is used to redescribe the data:
\[K(x_i, x_j) = \exp \left( -  \frac{\| x_i - x_j\|^2}{2 \sigma^2} \right) \]

Another frequent choice is a polynomial kernel:
\[K(x_i, x_j) = \left( \tr{x_i} \cdot x_j +1 \right)^d \]

The choice of one kernel or another may lead to significant differences in performance for each particular problem.

Finally, standard SVM has a major drawback: it does not directly provide estimates of the posterior class probabilities. Finding a probabilistic framework for SVM remains an active topic of statistical research \cite{Franc11}. We used here the solution proposed by Platt in 2000 \cite{Platt00}, which approximates the posterior probabilities with a sigmoid function.

Further technical details about standard SVM can be found in \cite{Izenman08}.
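
As a minimal sketch, such a Gaussian-kernel SVM with Platt's posterior probabilities can be fitted in R with the e1071 package \cite{Meyer01} (hypothetical data frames again):
\begin{verbatim}
library(e1071)
## Gaussian-kernel SVM; probability = TRUE
## triggers Platt's sigmoid approximation of
## the posterior class probabilities.
fit <- svm(sex ~ ., data = train,
           kernel = "radial",
           probability = TRUE)
pred <- predict(fit, newdata = test,
                probability = TRUE)
attr(pred, "probabilities")
\end{verbatim}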

\subsection{Case study application}
\subsubsection{Design of the study}
The four populations in the data set are used as follows:
\begin{itemize}
\item the Portuguese sample (hereinafter \enquote{POR}) is the largest one and has the great advantage of a perfectly balanced sex ratio, so it is used as our basic training set, on which each of the four methods is calibrated;
\item a first validation is performed on the French population (\enquote{FRA}) to assess the portability of the models learned on POR within the European continent;
\item the last two populations, American (\enquote{USA}) and Thai (\enquote{THA}), are also used as test sets to assess the portability of our models to non-European populations.
\end{itemize}

There is an additional difficulty: some of the 16 variables defined in our study could not be measured in all the populations. Fourteen variables are available in the subset composed of the populations POR, FRA and THA, but only 7 could be measured simultaneously in all four populations (see Table \ref{var}).

\begin{table}[h]
\sf
\begin{center}
\begin{tabular}{|c||c|c|c|c|}
\hline 
 & POR & FRA & THA & USA \\ 
\hline
\hline 
M1GOL & $\star$ & $\star$ & $\star$ & $\star$ \\ 
\hline 
M8XCB & $\star$ & $\star$ & $\star$ & $\star$ \\ 
\hline 
M9WFB & $\star$ & $\star$ & $\star$ & $\star$ \\ 
\hline 
M45ZYB & $\star$ & $\star$ & $\star$ & $\star$ \\ 
\hline 
M12ASB & $\star$ & $\star$ &  & $\star$ \\ 
\hline 
M5BNL & $\star$ & $\star$ & $\star$ & $\star$ \\ 
\hline 
M17BBH & $\star$ & $\star$ & $\star$ & $\star$ \\ 
\hline 
M40BPL & $\star$ & $\star$ & $\star$ &  \\ 
\hline 
M48NPH & $\star$ & $\star$ & $\star$ & $\star$ \\ 
\hline 
M61MAB & $\star$ & $\star$ & $\star$ &  \\ 
\hline 
M52OBH & $\star$ &  & $\star$ &  \\ 
\hline 
M54NLB & $\star$ & $\star$ & $\star$ &  \\ 
\hline 
M29FRC & $\star$ & $\star$ & $\star$ &  \\ 
\hline 
M30PAC & $\star$ & $\star$ & $\star$ &  \\ 
\hline 
M31OCC & $\star$ & $\star$ & $\star$ &  \\ 
\hline 
M11AUB & $\star$ & $\star$ & $\star$ &  \\ 
\hline 
\end{tabular} 
\end{center}
\caption{List of the variables which could be measured in each population: a $\star$ indicates that the measurement is available.}
\label{var}
\rm
\end{table}
 
Consequently, the study will be divided into two experiments:
\begin{enumerate}
\item Using only the 7 variables measured in the whole dataset, the four methods are compared without any variable selection. This gives a first indication of the \enquote{raw performance} of each method.
\item We exclude the US population and use the 14 variables measured in the three other ones. As the number of variables becomes larger, this time we try to optimize the methods through variable selection (see section \ref{optim}). Since variable selection can be quite tricky or time-consuming (especially for SVM), this will allow us to assess its real benefit for each method.
\end{enumerate}

Finally, multivariate normality tests were performed for each population and each group (males and females). None of the distributions showed a significant deviation from normality (at the 0.05 level).

All the statistical analyses in our study were performed with R 2.14.1 \cite{Rsoft}.

\subsubsection{Classifiers settings and variable selection}
% Describe here (in no particular order, to be organized):
% - the cross-validation process on POR; --> OK
% - the optimization grid for SVM, the pruning of the tree; --> OK
% - variable selection in LR, SVM and LDA;
\label{optim}
\textit{Cross-validation}. --- As the Portuguese sample is our basic training set, we first used a leave-one-out cross-validation (LOOCV) procedure to evaluate the accuracy of the different methods on this population, and to find the optimal settings of the model parameters.
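
The principle of this procedure can be summarized by the following generic R sketch, where \texttt{classify} stands for any of the four classifiers (hypothetical function and column names):
\begin{verbatim}
## Generic LOOCV sketch: classify(tr, te)
## returns the predicted sex of `te` when
## trained on `tr`.
loocv <- function(data, classify) {
  hits <- sapply(seq_len(nrow(data)),
    function(i) {
      p <- classify(data[-i, ],
                    data[i, , drop = FALSE])
      p == data$sex[i]
    })
  mean(hits)  # LOOCV accuracy
}
\end{verbatim}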

\medskip

\textit{Classifier settings}. --- The kernel used in the SVM was a Gaussian function. Two parameters must be tuned to obtain the best performance: the cost ($c$), and a variance-related parameter $\gamma = 1/(2\sigma^2)$, in the notation of the Gaussian kernel above. To determine their optimal values, one usually defines a grid search $\mathcal{L} = \left\{(2^k, 2^l) \, | \, k \in \llbracket -5, 15 \rrbracket, \, l \in \llbracket -15, 5 \rrbracket \right\}$, and compares the accuracies obtained by LOOCV for each pair of parameters $(c, \gamma) \in \mathcal{L}$. The grid is classically defined by powers of 2 (our choice) or 10 \cite{Izenman08}. The pair of parameters leading to the best accuracy is finally retained. This whole procedure was performed with R, using the e1071 package \cite{Meyer01}.
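
For instance, this grid search can be written with e1071 as follows (\texttt{tune.control(cross = nrow(por))} requests LOOCV; the data frame \texttt{por} is a hypothetical name for the Portuguese training set):
\begin{verbatim}
library(e1071)
## Grid search over (cost, gamma) by LOOCV.
tuned <- tune.svm(sex ~ ., data = por,
  kernel = "radial",
  cost  = 2^(-5:15),
  gamma = 2^(-15:5),
  tunecontrol =
    tune.control(cross = nrow(por)))
tuned$best.parameters  # retained pair
\end{verbatim}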

\medskip

\textit{Variable selection}. --- Variable selection in LR was performed using a classical stepwise procedure (see the sketch below).
In LDA, we used a very simple and easy-to-handle selection procedure, consisting of removing the predictors whose linear coefficients seemed to have only a tiny impact on the score function (\textit{i.e.}, predictors with a coefficient close to 0). This naive but rather efficient approach, commonly used by students and practitioners in anthropology, is to be contrasted with the much more complicated approach required for variable selection in SVM.
Variable selection is not supposed to be mandatory in SVM, but some recent studies show that SVM accuracy can actually be significantly improved by selecting only the best predictors \cite{Rakoto04}.
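
For the logistic model, the classical stepwise procedure is readily available in R through the AIC-based \texttt{step} function; a minimal sketch, with the same hypothetical data frame as above:
\begin{verbatim}
## AIC-based stepwise selection for LR,
## starting from the full model.
full <- glm(sex ~ ., data = train,
            family = binomial)
selected <- step(full, direction = "both")
\end{verbatim}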

We implemented in R the SVM-RFE algorithm for variable selection in SVM, developed by Guyon \textit{et al.} \cite{Guyon02}, which ranks the predictors in decreasing order of discriminant power. The user must then choose how many predictors to retain. An exhaustive (and time-consuming) comparison of the LOOCV accuracies of the models built with only the best predictor, then the two best predictors, and so on up to all the predictors, can be performed to determine the optimal number of variables to retain.
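
A minimal sketch of such an implementation is given below, assuming a linear kernel (for which Guyon \textit{et al.} derive the ranking criterion $w_j^2$); the matrix of predictors \texttt{x} and the factor of sexes \texttt{y} are hypothetical names:
\begin{verbatim}
library(e1071)
## SVM-RFE sketch: at each step, the predictor
## with the smallest squared weight of the
## separating hyperplane is eliminated.
svm_rfe <- function(x, y) {
  ranking <- character(0)
  feats <- colnames(x)
  while (length(feats) > 0) {
    fit <- svm(x[, feats, drop = FALSE], y,
               kernel = "linear")
    w <- t(fit$coefs) %*% fit$SV
    worst <- feats[which.min(w^2)]
    ranking <- c(worst, ranking)
    feats <- setdiff(feats, worst)
  }
  ranking  # best predictors first
}
\end{verbatim}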

\subsubsection{Comparison of the results}
% Describe here (in no particular order, to be organized):
% - the comparison of accuracies, the indicators and tests used;
% - the 95%, 90% and 50% confidence levels (when meaningful).
[Coming soon.] 

\section{Results}
\subsection{With all the populations and without variable selection}
[Fully computed; the written description is to come.]

\medskip

See Table \ref{tab:95avecUS}.

\begin{table*}[ht]
\fbox{
\begin{minipage}[c]{\textwidth}
\centering
\sf
\begin{tabular}{|c||c|c|c|c|c|c|c|c|}
\hline
& \multicolumn{2}{|c|}{CT} & \multicolumn{2}{|c|}{LDA} & \multicolumn{2}{|c|}{LR} & \multicolumn{2}{|c|}{SVM} \\
\hline
& \% det. & \% accu. & \% det. & \% accu. &\% det. & \% accu. &\% det. & \% accu. \\
\hline
\hline
POR & 0 & -- & 47.5 & \textbf{100} & \textbf{58.2} & 96.7 & 25.2 & \textbf{100} \\
\hline
FRA & 0 & -- & 52 & \textbf{92.3} & \textbf{62} & 90.3 & 36 & 88.9 \\
\hline
USA & 0 & -- & 63.3 & 92.1 & \textbf{68.3} & 92.7 & 43.3 & \textbf{100} \\
\hline
THA & 0 & -- & 52.7 & \textbf{70.8} & \textbf{63.7} & 63.8 & 32.9 & 66.7  \\
\hline

\end{tabular}
\caption{Results of the classification with a threshold value of 0.95: only the individuals with a posterior probability greater than 0.95 were taken into account in the computation of accuracy (\enquote{\% accu.}). The percentage of individuals reaching this threshold is given by \enquote{\% det.}. The other individuals were excluded.}
\label{tab:95avecUS}
\end{minipage}
}
\rm
\end{table*}

\subsection{Without USA population and with variable selection}
[Computational work in progress.]

See Table \ref{tab:95sansUSsansVS} for the results without variable selection, and Table \ref{tab:95sansUSavecVS} for the results with variable selection.

\begin{table*}[ht]
\fbox{
\begin{minipage}[c]{\textwidth}
\centering
\sf
\begin{tabular}{|c||c|c|c|c|c|c|c|c|}
\hline
& \multicolumn{2}{|c|}{CT} & \multicolumn{2}{|c|}{LDA} & \multicolumn{2}{|c|}{LR} & \multicolumn{2}{|c|}{SVM} \\
\hline
& \% det. & \% accu. & \% det. & \% accu. &\% det. & \% accu. &\% det. & \% accu. \\
\hline
\hline
POR & 0 & -- & 54 & 96.3 & \textbf{66} & 94 & 29 & \textbf{100} \\
\hline
FRA & 0 & -- & 54 & 92.6 & \textbf{72} & 86.1 & 20 & \textbf{100} \\
\hline
THA & 0 & -- & 67 & 54.7 & \textbf{63.7} & 77.2 & 0 & --  \\
\hline

\end{tabular}
\caption{Results of the classification with a threshold value of 0.95 and without variable selection.}
\label{tab:95sansUSsansVS}
\end{minipage}
}
\rm
\end{table*}


\begin{table*}[ht]
\fbox{
\begin{minipage}[c]{\textwidth}
\centering
\sf
\begin{tabular}{|c||c|c|c|c|c|c|c|c|}
\hline
& \multicolumn{2}{|c|}{CT} & \multicolumn{2}{|c|}{LDA} & \multicolumn{2}{|c|}{LR} & \multicolumn{2}{|c|}{SVM} \\
\hline
& \% det. & \% accu. & \% det. & \% accu. &\% det. & \% accu. &\% det. & \% accu. \\
\hline
\hline
POR & 0 & -- & 50 & 98 & \textbf{66} & 95.9 & 47 & 95.8 \\
\hline
FRA & 0 & -- & 56 & 92.9 & 54 & 88.5 & 16 & \textbf{100} \\
\hline
THA & 0 & -- & 66 & 59.5 & 49.4 & 56.4 & 7.7 & \textbf{71.4} \\
\hline

\end{tabular}
\caption{Results of the classification with a threshold value of 0.95 and with variable selection.}
\label{tab:95sansUSavecVS}
\end{minipage}
}
\rm
\end{table*}

\section{Discussion}
[Coming soon.] 

\section*{References}
\bibliographystyle{model1-num-names}
\bibliography{biblio}


\end{document}
